# ! /usr/bin/python2.7
# -*- coding: utf-8 -*-
"""
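Scrape the latest administrative-division list from the National Bureau of
Statistics of China (stats.gov.cn).
Three levels: province, city, district. Despite the original "JSON version"
label, this script writes the records to an xlwt workbook.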
author: gxcuizy
time: 2018-08-24
"""

import requests
import time

from xpinyin import Pinyin
import xlwt
from bs4 import BeautifulSoup


def get_province(index_href):
    """Scrape province-level records and descend into each province's cities.

    Fetches ``url + index_href``, decodes the response as GBK and, for every
    ``<a>`` carrying an ``href`` inside a ``.provincetr`` element, appends a
    record dict (keys ``code``, ``level``, ``name``, ``pinyin``, ``parent``)
    to the module-level ``province_list``, then calls ``get_city`` for it.

    Relies on the module-level names ``url``, ``pin`` and ``province_list``
    that the script entry point sets up.

    :param index_href: page name relative to ``url``.
    :raises requests.exceptions.RequestException: on connection failure or
        when the request times out.
    """
    # Bound the request so a stalled connection cannot hang the crawl forever.
    response = requests.get(url + index_href, timeout=30)
    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.select('.provincetr a'):
        if not link.has_attr("href"):
            continue
        province_href = link.attrs["href"]
        # The href part before the first "." is padded with "0000" to build
        # the province code.
        province_code = province_href.split('.')[0] + "0000"
        province_name = link.text
        province_list.append({
            "code": province_code,
            "level": "100",
            "name": province_name,
            "pinyin": pin.get_pinyin(province_name, ""),
            "parent": '',
        })
        get_city(province_href, province_code)


def get_city(province_href, province_code):
    """Scrape city-level records of one province and descend into districts.

    Fetches ``url + province_href``, decodes the response as GBK and, for
    every ``.citytr`` row that contains at least two links, appends a record
    dict (keys ``code``, ``level``, ``name``, ``pinyin``, ``parent``) to the
    module-level ``province_list``, then calls ``get_district`` for it.

    Relies on the module-level names ``url``, ``pin`` and ``province_list``
    that the script entry point sets up.

    :param province_href: province page path relative to ``url``.
    :param province_code: code stored as ``parent`` on every city record.
    :raises requests.exceptions.RequestException: on connection failure or
        when the request times out.
    """
    # Bound the request so a stalled connection cannot hang the crawl forever.
    response = requests.get(url + province_href, timeout=30)
    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select('.citytr'):
        links = row.select("a")
        if len(links) < 2:
            continue
        city_href = links[0].attrs["href"]
        # Only the first six characters of the first link's text are kept
        # as the city code; the second link's text is the name.
        city_code = links[0].text[0:6]
        city_name = links[1].text
        province_list.append({
            "code": city_code,
            "level": "010",
            "name": city_name,
            "pinyin": pin.get_pinyin(city_name, ""),
            "parent": province_code,
        })
        get_district(city_href, city_code)


def get_district(city_href, city_code):
    """Scrape district-level records of one city.

    Fetches ``url + city_href``, decodes the response as GBK and walks every
    ``.countytr`` row.  A row without any link is read from its ``<td>``
    cells; a row with at least two links is read from the links.  Rows with
    exactly one link are skipped.  Each accepted row appends a record dict
    (keys ``code``, ``level``, ``name``, ``pinyin``, ``parent``) to the
    module-level ``province_list``.

    Relies on the module-level names ``url``, ``pin`` and ``province_list``
    that the script entry point sets up.

    :param city_href: city page path relative to ``url``.
    :param city_code: code stored as ``parent`` on every district record.
    :raises requests.exceptions.RequestException: on connection failure or
        when the request times out.
    """
    # Bound the request so a stalled connection cannot hang the crawl forever.
    response = requests.get(url + city_href, timeout=30)
    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select(".countytr"):
        links = row.select("a")
        if not links:
            # Link-less row: code and name come from the plain table cells.
            fields = row.select("td")
        elif len(links) >= 2:
            fields = links
        else:
            continue
        if len(fields) < 2:
            # Malformed row without both a code and a name cell; skip it
            # instead of raising IndexError.
            continue
        # Only the first six characters of the code text are kept.
        district_code = fields[0].text[0:6]
        district_name = fields[1].text
        province_list.append({
            "code": district_code,
            "level": "001",
            "name": district_name,
            "pinyin": pin.get_pinyin(district_name, ""),
            "parent": city_code,
        })


# Program entry point
if __name__ == "__main__":
    import os

    # Module-level state read by get_province / get_city / get_district.
    pin = Pinyin()
    province_list = []
    init_html = "index.html"
    # The listing for the previous calendar year is requested.
    year = int(time.strftime("%Y")) - 1
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%d/' % (year)

    # Create the output directory up front so that a missing "json" folder
    # does not make the final save fail after the whole crawl has finished.
    output_dir = "json"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    get_province(init_html)

    workbook = xlwt.Workbook(encoding="UTF-8")
    worksheet = workbook.add_sheet(str(year))
    # No header row is written; each record fills one row with the columns
    # code, name, level, parent code, pinyin (in that order).
    columns = ("code", "name", "level", "parent", "pinyin")
    for row_index, record in enumerate(province_list):
        for col_index, key in enumerate(columns):
            worksheet.write(row_index, col_index, record[key])
    # NOTE(review): xlwt writes the legacy .xls format, so the ".xlsx"
    # extension probably does not match the file content -- confirm and
    # consider renaming.  The file name is kept unchanged here.
    workbook.save("json/" + str(year) + "省市区.xlsx")
